#Goal Plot the integrations (beeswarm plots), expression scores and actual expression for the Sox2P reporter hopping pools and clones. This is figure 2 and figure S2 of the manuscript. Because we load all the clonal data here, we also plot figure 5A (which is based on the same clones)
To re-run this code, change the paths in ‘File paths’ to the correct location of datasets. The expression scores are bootstrapped 5000 times, which takes a while to run. Therefore, these chunks are commented out and the bootstrapped data is loaded from a local RDS file. To re-run those, run the bootstrapped expression score calculation once and save the result locally.
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse() masks IRanges::collapse()
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ dplyr::desc() masks IRanges::desc()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename() masks S4Vectors::rename()
## ✖ lubridate::second() masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice() masks IRanges::slice()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# library(gtools)
library(tidyverse)
# library(dtplyr)
library(dplyr)
# library(tidyr)
library(ggbeeswarm)
library(ggplot2)
library(Biostrings)## Loading required package: XVector
##
## Attaching package: 'XVector'
##
## The following object is masked from 'package:purrr':
##
## compact
##
##
## Attaching package: 'Biostrings'
##
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
## Loading required package: viridisLite
##
## Attaching package: 'caTools'
##
## The following object is masked from 'package:IRanges':
##
## runmean
##
## The following object is masked from 'package:S4Vectors':
##
## runmean
library(readxl)
import::from(flowCore, .except=c("filter")) #I don't load the whole flowCore and ggcyto libraries, because they overwrite dplyr's filter function :0
import::from(ggcyto, 'fortify_fs')
library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GENOVA'
##
## The following object is masked from 'package:ggplot2':
##
## resolution
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
#datatag
datetag = paste0("CM",format(Sys.time(), '%Y%m%d'))
# Prepare output
output_dir <- paste0("/DATA/projects/Sox2/Figure_reporter_only_hopping/analysis_",datetag)
dir.create(output_dir, showWarnings = FALSE)library(knitr)
opts_chunk$set(cache = T,
message = F,
warning = F,
dev=c('png', 'pdf'),
dpi = 600,
fig.path = file.path(output_dir, "figures/"),
cache.lazy = F)
pdf.options(useDingbats = FALSE)path_CTCF_sites = "/DATA/usr/v.franceschini/Workspaces/2024_01_MATHIAS_CTCF_PEAKS/VF240812_calling_CTCF_motifs_again/02_OUTPUTS/CTCF_sites/6764_2_ME_E2_CGATGT_S2_R1_001_peaks_motifs.bed"
path_tib_23_ints = "/DATA/projects/Sox2/GEO_collection/Tagmentation/mapped_integrations_-116kb_Sox2P_pools.txt"
path_tib_C9_ints = "/DATA/projects/Sox2/GEO_collection/Tagmentation/mapped_integrations_-161kb_Sox2P_pools.txt"
path_median_mTurq_relevant_cell_lines = "/DATA/projects/Sox2/Figure_reporter_only_hopping/CM20240814_median_mTurq_sorting_for_paper.tsv"
path_RCMC_WT_file = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/CM20230718_GSM6281849_RCMC_BR1_merged_allCap_WT_mm39.merged.50.mcool"
path_mm10_to_mm39 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm39.over.chain"
path_mm39_to_mm10 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm39ToMm10.over.chain"
paths_ext_data_files = c(
ATAC = '/DATA/usr/c.moene/public_data/ATAC/GSE98390_E14_ATAC_MERGED.DANPOS.bw',
H3K27ac = '/DATA/usr/c.moene/public_data/ChIP/Histons/H3K27ac_ENCFF583WVZ_signal_pval.bigWig',
CTCF = '/DATA/projects/Sox2/CTCF_ChIP_analysis/results/bigwig/6764_2_ME_E2_CGATGT_S2_R1_001.bw' #new, aligned to mm10 only
)
diva_xml_file_E2555_sorting = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2534_hopping_C9_LP/FACS/Mathias (BvS)20240501_E2534_rep1.xml"
path_FACS_sorting_E2555 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2534_hopping_C9_LP/FACS"
diva_xml_file_E2096_sorting = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2096_hopping_mTurq_6gates/fcs_for_R_sorting/Diva_export/Mathias (BvS) 20221117_mTurq/Mathias (BvS) 20221117_mTurq.xml"
path_FACS_sorting_E2096 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2096_hopping_mTurq_6gates/fcs_for_R_sorting/Diva_export/Mathias (BvS) 20221117_mTurq"
path_insertion_clones_E2221 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2344_mapping_clones/CM20231218_E2221_locations_all_mapped_clones.rds"
path_link_well_clonename_E2221 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2217_FACS_clones/CM20230309_E2204_link_wellindex_clonename.txt"
path_fcs_files_E2221 = '/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2217_FACS_clones/gated_fcs_files/plate_reader'
path_WT_ctrl_E2250 = "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/E2250_CRISPR_Sox2_delBclones_pools/fcs_for_R/export_ME20230418_E2250_pools_WT_001_Single Cells.fcs"
#functions
source("/DATA/projects/Sox2/General_functions/CM20240621_general_plotting_functions.R")
source("/DATA/projects/Sox2/General_functions/CM20240626_expression_score_functions.R")From the combined results table
#load the -116kb data (LP23)
tib_23_all_data = read_tsv(path_tib_23_ints) %>%
filter(chr %in% paste0("chr", c(1:19, "X"))) %>%
mutate(chr = factor(chr, levels = paste0("chr", c(1:19, "X"))))
#the control pool in E2285 (rep3) was sequenced as 10 separate libraries, combine them into one sample
tib_23_E2285_ctrls = tib_23_all_data %>%
filter(replicate == "rep3" & insert == "Sox2P" & sample_type == "control pool") %>%
mutate(sample_name = "Sox2P-reporter -116 kb, unsorted control pool rep3")%>%
group_by(chr, start, end, seq, strand, replicate, experiment, launch_pad_location, insert, genotype, sample_type, sorted_gate, pool_nr, sample_name) %>%
summarize(read_count = sum(read_count),
read_count_1 = sum(read_count_1),
read_count_2 = sum(read_count_2), .groups = "keep")
tib_23 = tib_23_all_data %>%
#select the rest of the data
filter(!(replicate == "rep3" & insert == "Sox2P" & sample_type == "control pool")) %>%
select(chr, start, end, seq, strand, replicate, experiment, launch_pad_location, insert, genotype, sample_type, sorted_gate, pool_nr, sample_name,
read_count, read_count_1, read_count_2)%>%
#add merged controls back in
bind_rows(tib_23_E2285_ctrls) %>%
#mark hopped integrations
mutate(hopped = start != 34643961)
#load the -161kb data (LP C9)
tib_C9 = read_tsv(path_tib_C9_ints) %>%
filter(chr %in% paste0("chr", c(1:19, "X"))) %>%
mutate(chr = factor(chr, levels = paste0("chr", c(1:19, "X")))) %>%
#combine the 2 libraries of "Sox2P-reporter -161 kb, population P1 pool1 rep2"
mutate(to_merge = case_when(replicate == "rep2" & insert == "Sox2P" & sorted_gate == "P1" ~ "merge",
.default = sample_name)) %>%
mutate(sample_name = case_when(to_merge == "merge" ~ "Sox2P-reporter -161 kb, population P1 pool1 rep2",
.default = sample_name)) %>%
group_by(chr, start, end, seq, strand, replicate, experiment, launch_pad_location, insert, genotype, sample_type, sorted_gate, pool_nr, sample_name) %>%
summarize(read_count = sum(read_count),
read_count_1 = sum(read_count_1),
read_count_2 = sum(read_count_2), .groups = "keep") %>%
#mark hopped integrations
mutate(hopped = start != 34598479)
#filter the integrations
tib_filtered <- bind_rows(tib_23,tib_C9) %>%
#add variables used in the plotting (old names of cell lines etc)
mutate(population = case_when(sample_type == "control pool" ~ "ctrl",
.default = sorted_gate),
cell_line = case_when(launch_pad_location == "-116 kb" & insert == "Sox2P" ~ "23_34A",
launch_pad_location == "-161 kb" & insert == "Sox2P" ~ "C9_mCh_34"),
landing_pad = case_when(launch_pad_location == "-116 kb" ~ "23",
launch_pad_location == "-116 kb" ~ "C9")) %>%
#annotate confidence of mapping (one or both ITRs)
mutate(mapped_arms = case_when(read_count_1 == 0 ~ "rv_only",
read_count_2 == 0 ~ "fw_only",
read_count_1 > 0 & read_count_2 > 0 ~ "both_arms")) %>%
#filter the data
filter(population == "ctrl" | read_count >= 2) %>% #for ctrl distribution 1 mapping read is enough
filter(!(start >= 34721183 & start <= 34721192)) %>% #remove contamination from LP8
filter(! start %in% c(25018987, 35232946) ) %>% #remove contamination clone C1
filter(strand %in% c("+", "-"))
tib_filtered_P1_strict = tib_filtered %>%
filter(!(population == "P1" & mapped_arms != "both_arms")) #only keep P1 integrations with 2 mapped arms#load median expression values (from sorting pdfs)
median_sorted_mTurq = read_tsv(path_median_mTurq_relevant_cell_lines)
#get P1 expression from 23_34A for C9
mT_23_34A_P1 = median_sorted_mTurq %>%
filter(population == "P1" & cell_line == "23_34A" & mCherry_selection == "mCh+") %>%
pull(mTurq) %>% mean()
#combine and add annotations
median_sorted_mTurq_tib = median_sorted_mTurq %>%
#add P1 from 23_34A to C9_34
mutate(mTurq = case_when(cell_line == "C9_mCh_34" & population == "P1" & is.na(mTurq) ~mT_23_34A_P1,
.default = mTurq)) %>%
dplyr::rename(experiment_sorting = experiment) %>%
mutate(experiment =
case_when(experiment_sorting == "E2096" ~ "E2138", # 23_34A (rep1)
experiment_sorting == "E2270" ~ "E2282", # 23_34A (rep2)
experiment_sorting == "E2259" ~ "E2285", # 23_34A (rep3)
.default = experiment_sorting)) %>%
mutate(replicate = case_when(experiment == "E2138" ~ "rep1",
experiment == "E2282" ~ "rep2",
experiment == "E2285" ~ "rep3",
.default = replicate)) %>%
select(-experiment_sorting)
#add mTurq values to the integration tibble
tib_filtered_mTurq_vals = left_join(tib_filtered_P1_strict, median_sorted_mTurq_tib) %>%
filter(population != "ctrl") %>% # & !is.na(population)
rename(mTurq_expr_gate = mTurq) #some setups
all_exp = unique(tib_filtered_P1_strict$experiment)
# Location of enhancer / gene
enhancer <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34753415,
end = 34766401),
strand = "*")
Sox2_gene <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34649995,
end = 34652461),
strand = "+")
landingPad_23 <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34643960,
end = 34643962),
strand = "+")
landingPad_C9 <- GRanges(seqnames = "chr3",
ranges = IRanges(start = 34598479,
end = 34598480),
strand = "+")
LP_tib = tibble(landing_pad = c("LP23", "LPC9"),
start = c( start(landingPad_23),start(landingPad_C9)))
#CTCF sites based on our own mapping
# new analysis (based on mapping to mm10 only)
CTCF_mm10_new <- import.bed(path_CTCF_sites)
#filtering:
CTCF_mm10.chr3 <- CTCF_mm10_new[seqnames(CTCF_mm10_new)== 'chr3']
# add the missing site after the SCR
CTCF_mm10.chr3_extra = sort(c(CTCF_mm10.chr3,
GRanges(seqnames = "chr3",
IRanges(start = 34772210, end = 34772210),
strand = "+")),
ignore.strand = T)General colors and ranges
# Plotting ranges
prange_zoom = c(34.5E6, 35E6)
prange_zoom_paper = c(34.59E6, 34.83E6)
prange_SCR = c(34.725E6, 34.785E6)
# Color annotations
## SCR
brown = "#ffb000"
#color schemes
myCols_blue = c('#525252', "#080E5B", "#212B71", "#3A4987", "#465893", "#5A719A", "#6C8AA0")
names(myCols_blue) = c("ctrl", paste0("P", 1:6))
colScale7_blue <- scale_colour_manual(name = "population", values = myCols_blue)
fillScale7_blue <- scale_fill_manual(name = "population", values = myCols_blue)
myCols_blue_strand = c( "#080E5B", "#5A719A")
names(myCols_blue_strand) = c("+","-")
colScale2_blue <- scale_colour_manual(name = "strand", values = myCols_blue_strand)
fillScale2_blue <- scale_fill_manual(name = "strand", values = myCols_blue_strand)plot_beeswarm_only = function(TIB, CL = "23_34A", EXP = all_exp, P_RANGE = prange_zoom,
PLOT_CTCF = F, PLOT_ANNOT = T,
TITLE = NULL, POINT_SIZE = 0.75, RANDOM_METHOD = "quasirandom",
NEXT_RANGE = NULL){
tib_for_plot = filter(TIB, hopped & chr == "chr3" & cell_line %in% CL & experiment %in% EXP & start >= P_RANGE[1] & start <= P_RANGE[2])
LP_tib_cl = select(tib_for_plot, c(cell_line, landing_pad)) %>%
distinct() %>% left_join(LP_tib)
ggplot(filter(tib_for_plot, hopped & chr == "chr3" & cell_line %in% CL & experiment %in% EXP),
aes(x = start, y = population, col = population)) +
{if(length(CL)>1) facet_grid(cell_line ~ .)} + #only facet when necessary
{if (PLOT_ANNOT)
list( #add elements in a list, you cannot use + inside an if statement
# annotate enhancer
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf, alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') )} +
{if(PLOT_ANNOT & length(CL)== 1)
geom_vline(xintercept = pull(
filter(LP_tib, landing_pad %in% unique(tib_for_plot$landing_pad)),
start),
col = 'darkgrey')} +
{if(PLOT_ANNOT & length(CL) > 1)
geom_vline(data = LP_tib_cl, aes(xintercept = start), col = 'darkgrey')} +
# # annotate CTCFs
{if(PLOT_CTCF)geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey', lty="11")}+
{if(!is.null(NEXT_RANGE)) geom_rug(data = tibble(range = NEXT_RANGE),
aes(x = range),
fill = 'grey',
inherit.aes = F)}+
#annotate landing pad
labs(x='Genomic coordinate (Mb)',y='sorted gate') +
#datapoints
geom_quasirandom(alpha = 0.5, size = POINT_SIZE, varwidth = T, bandwidth = 0.8,
# shape = 16, #changes to point without border, so alpha applies to the whole point
method = RANDOM_METHOD,
stroke = NA) +
geom_text(data = plyr::count(tib_for_plot, vars = c("cell_line", "population")), aes(label = paste("n = ", freq, sep = ""),
x = Inf),
hjust = "inward", vjust = "top",
col = 'black') +
#layout:
theme_classic(base_size = 16) +
theme(legend.position = "none") +
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=P_RANGE, expand=c(0,0)) +
scale_y_discrete(limits = rev) +
ggtitle(TITLE) +
colScale7_blue
}range_2Mb = c(33.643960E6, 35.643960E6)
plot_beeswarm_only(TIB = filter(tib_filtered_P1_strict, hopped & chr == "chr3" & experiment %in% all_exp),
CL = "23_34A",
PLOT_ANNOT = T,
PLOT_CTCF = F,
P_RANGE = range_2Mb,
POINT_SIZE = 1.5
)plot_beeswarm_only(TIB = filter(tib_filtered_P1_strict, hopped & chr == "chr3" & experiment %in% all_exp),
CL = "C9_mCh_34",
PLOT_ANNOT = T,
PLOT_CTCF = F,
P_RANGE = c(33.5E6, 36E6),
POINT_SIZE = 1.5
)LP23 and C9
plot_beeswarm_only(TIB = filter(tib_filtered_P1_strict, hopped & chr == "chr3" & experiment %in% all_exp ),
CL = c("23_34A", "C9_mCh_34"),
PLOT_ANNOT = T,
PLOT_CTCF = T,
P_RANGE = prange_zoom_paper,
POINT_SIZE = 1.5
)Plotting triangles in a pdf is a nightmare, so instead 1 just make one plot with the CTCF lines colored by orientation and based on that add the CTCF triangles manually to the paper figures.
ggplot() +
# annotate enhancer
# annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown)+
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown)+
# annotate gene
# annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf, alpha = .2, fill = 'red')+
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
geom_vline(data = as_tibble(CTCF_mm10.chr3_extra), aes(xintercept = start, col = strand), size = 2, lty = 'dashed') +
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=prange_zoom_paper, expand=c(0,0)) +
theme_bw()Calculate (bootstrapped) expression score
# calculate running mean on random sample of same size, 5000 times
# set.seed(20231012)
# expr_score_tib = create_bootstrap_tibble(FUN = function(TIB){expr_score_function(TIB,
# binsize = 1000,
# N_bins_window = 10,
# calculation_window = prange_zoom,
# CL = "23_34A", EXP = all_exp)},
# TIB = tib_filtered_mTurq_vals,
# N_resamples = N_bootstrap)
# saveRDS(expr_score_tib, "/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240408_expression_score_reporter_only.rds")
expr_score_tib_23_34A = readRDS("/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240408_expression_score_reporter_only.rds")
expr_score_sum_tib_23_34A = bootstrap_summary_fun(expr_score_tib_23_34A,
score_column = "expr_score_window",
conf_int = 0.95)
#calculate actual expr score (for filtering # of ints per window)
expr_score_23_34A =
expr_score_function(tib_filtered_mTurq_vals,
binsize = 1000,
N_bins_window = 10,
calculation_window = prange_zoom,
CL = c("23_34A"),
EXP = all_exp)
min_ints_window = 3
expr_score_sum_tib_23_34A_filt = left_join(expr_score_sum_tib_23_34A, expr_score_23_34A) %>%
mutate(expr_score_window_median_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_median,
.default = NA),
expr_score_window_lower_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_lower,
.default = NA),
expr_score_window_upper_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_upper,
.default = NA))expr_plot_23_34A = ggplot(expr_score_sum_tib_23_34A_filt, aes(x = mid_interval)) +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_23), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')
) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower_sel, ymax = expr_score_window_upper_sel), alpha = 0.2, fill = "#080E5B") +
# geom_line( aes(y = expr_score_window_mean), col = 'orange') +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median_sel), col = "#080E5B") +
#axes
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=prange_zoom_paper, expand=c(0,0)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=6),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'none')
expr_plot_23_34ACalculate (bootstrapped) expression score
# calculate running mean on random sample of same size, 500 times
# set.seed(20240705)
# expr_score_tib_C9 = create_bootstrap_tibble(FUN = function(TIB){expr_score_function(TIB,
# binsize = 500,
# N_bins_window = 10,
# calculation_window = prange_zoom,
# CL = "C9_mCh_34", EXP = all_exp)},
# TIB = tib_filtered_mTurq_vals,
# N_resamples = N_bootstrap)
# saveRDS(expr_score_tib_C9, "/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_reporter_only_C9_mChpos_34.rds")
expr_score_tib_C9= readRDS("/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_reporter_only_C9_mChpos_34.rds")
expr_score_sum_tib_C9 = bootstrap_summary_fun(expr_score_tib_C9,
score_column = "expr_score_window",
conf_int = 0.95)
#calculate actual expr score (for filtering # of ints per window)
expr_score_C9 =
expr_score_function(tib_filtered_mTurq_vals,
binsize = 500,
N_bins_window = 10,
calculation_window = prange_zoom,
CL = c("C9_mCh_34"),
EXP = all_exp)
min_ints_window = 3
expr_score_sum_tib_C9_filt = left_join(expr_score_sum_tib_C9, expr_score_C9) %>%
mutate(expr_score_window_median_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_median,
.default = NA),
expr_score_window_lower_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_lower,
.default = NA),
expr_score_window_upper_sel = case_when(
total_ints_window >= min_ints_window ~ expr_score_window_upper,
.default = NA))expr_plot_C9 = ggplot(expr_score_sum_tib_C9_filt, aes(x = mid_interval)) +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_C9), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')
) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower_sel, ymax = expr_score_window_upper_sel), alpha = 0.2, fill = "#080E5B") +
# geom_line( aes(y = expr_score_window_mean), col = 'orange') +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median_sel), col = "#080E5B") +
#axes
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=prange_zoom_paper, expand=c(0,0)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=6),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'none')
expr_plot_C9Load RCMC data
Calculate virtual 4C in 1kb bins, from the SCR
# download.file('https://hgdownload.soe.ucsc.edu/goldenPath/mm39/liftOver/mm39ToMm10.over.chain.gz',
# "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm39ToMm10.over.chain.gz")
# download.file('https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToMm39.over.chain.gz',
# "/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm39.over.chain.gz")
# library(R.utils)
# gunzip("/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm39ToMm10.over.chain.gz")
# gunzip("/DATA/usr/c.moene/projects/Hopping_Sox2_mTurq/general_analyses/region_capture_microC/mm10ToMm39.over.chain.gz")
chain_mm10_to_mm39 = import.chain(path_mm10_to_mm39)
chain_mm39_to_mm10 = import.chain(path_mm39_to_mm10)
prange_paper_mm39 = liftOver(GRanges(seqnames = "chr3", IRanges(start = prange_zoom_paper[1], end = prange_zoom_paper[2]) ), chain_mm10_to_mm39)
#core SCR
core_SCR_mm10 = GRanges("chr3:34757573-34762064") #brosh et al bins 24 & 26, the 2 essential elements in the SCR
core_SCR_mm39 = liftOver(core_SCR_mm10, chain_mm10_to_mm39)
SCR_virt4C_mm39_core = virtual_4C(RCMC_1kb, unlist(core_SCR_mm39),
c(start(unlist(prange_paper_mm39)), end(unlist(prange_paper_mm39)) ))
SCR_virt4C_mm39_GR_core= SCR_virt4C_mm39_core$data %>% mutate(start = mid - 500,
end = mid + 500) %>% GRanges()
SCR_virt4C_mm10_core = liftOver(GRanges(SCR_virt4C_mm39_GR_core), chain_mm39_to_mm10) %>% unlist()Plot virtual 4C track
virt_4c_plot_core =
as_tibble(SCR_virt4C_mm10_core) %>%
mutate(mid_interval = (start + end)/2) %>%
select(-mid) %>%
ggplot(aes(x = mid_interval, y = WT)) +
# annotate enhancer
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown) +
annotate("rect", xmin = start(enhancer), xmax = end(enhancer),
ymin = -Inf, ymax = Inf, alpha = .2, fill = brown) +
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene),
ymin = -Inf, ymax = Inf, alpha = .2, fill = 'red') +
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
# annotate CTCFs
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey', linetype = 'dotted') +
geom_col(fill = 'grey30', col = 'grey30') +
scale_x_continuous(
breaks = scales::pretty_breaks(n = 6),
labels = scales::unit_format(scale = 1E-6, accuracy = 0.05, unit=NULL),
limits = prange_zoom_paper,
expand = c(0,0)) +
labs(x='Genomic coordinate (Mb)',y='contact with SCR') +
theme_classic(base_size = 16) +
coord_cartesian(ylim = c(0, 2000), expand = F) +
theme(legend.position = 'none') +
geom_rug(data = tibble(pos = c(start(core_SCR_mm10), end(core_SCR_mm10))),
inherit.aes = F, aes(x = pos), col = 'gold')
cowplot::plot_grid(plotlist = list(expr_plot_23_34A, expr_plot_C9, virt_4c_plot_core),
ncol = 1, align="v", axis="lr")bw_select_chr3_oi = BigWigSelection(ranges = GRanges("chr3:34E6-35E6"), colnames = "score")
prange_zoom_gr = GRanges(seqnames = "chr3", IRanges(start = prange_zoom[1],
end = prange_zoom[2]))
#load raw datasets
bw_raw_ls = lapply(paths_ext_data_files, function(X){
rtracklayer::import.bw(X, selection = bw_select_chr3_oi)
})
names(bw_raw_ls) = names(paths_ext_data_files)
#convert to 10bp bins
bw_10bp_ls = lapply(bw_raw_ls, function(X){
BinAv_on_ROI(data_grange = X, region_oi_gr = prange_zoom_gr, binsize = 10)
})
names(bw_10bp_ls) = names(paths_ext_data_files)
#to set colors by group: make a vector with the groups as elements and the names of the datasets as names
col_groups = c("ATAC", "H3K27ac", "CTCF")
names(col_groups) = names(paths_ext_data_files)
#smoothen by 500bp (50*10bp)
plot_10bp_10smooth_ls = lapply(names(bw_10bp_ls), function(X){
input_tib_X = dplyr::rename(as_tibble(bw_10bp_ls[[X]]), !!X := score_binned) #"!! X := " treats the X as a variable, instead of the string X
PlotDataTracksLight(input_tib = input_tib_X,
exp_names = c(X),
color_groups = col_groups,
plot_chr = "chr3", plot_start = prange_zoom_paper[1], plot_end = prange_zoom_paper[2],
color_list = c("darkgreen","green4", "grey20"),
# aspect_ratio = 1/10,
smooth = 50,
fix_yaxis = T,
lighten_negative = F,
raster = T,
squish = T,
plot_y0_line = F) +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.ticks.x = element_blank())
})
names(plot_10bp_10smooth_ls) = names(paths_ext_data_files)Calculate binned expression score
# 1kb resolution
exp_score_mm10_bins = expr_score_function(tib_filtered_mTurq_vals,
binsize = 1000,
N_bins_window = 1,
calculation_window = c(min(start(SCR_virt4C_mm10_core)), max(end(SCR_virt4C_mm10_core))), #c(34590000, 34830000),
CL = c("23_34A", "C9_mCh_34"), EXP = all_exp)contact_expr_tib_core = left_join(exp_score_mm10_bins, as_tibble(SCR_virt4C_mm10_core), by = join_by("start_interval" == "start")) %>%
mutate( WT = replace_na(WT, 0))
ggplot(filter(contact_expr_tib_core,
#use -161kb launch pad
cell_line == "C9_mCh_34" &
# #exclude the bins on the SCR),
(start_interval < start(core_SCR_mm10)-1000 | start_interval > end(core_SCR_mm10)) &
!is.na(total_ints_bin) & total_ints_bin >= 3),
aes(x = WT, y = expr_score_window)) +
#data
geom_smooth(method = "lm", col = 'black') +
geom_point(alpha = 0.5, col = '#0077BB', size = 2.5) +
stat_cor(method = "spearman",
label.x.npc = "right", hjust = 1,
label.y.npc = 'bottom') +
#layout:
scale_x_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
labs(x = "contact with SCR", y = "expression score bin") +
# geom_smooth(method = 'lm') +
theme_classic(base_size = 14) +
theme(aspect.ratio = 1)Calculate (per replicate) fraction of all P1, P2 etc integrations in each bin across the locus, using 5kb bins in the ‘prange_zoom_paper’. Plot these against each other (R1 vs R2, R1 vs R3, R2 vs R3)
bin_borders = seq(from = prange_zoom_paper[1], to = prange_zoom_paper[2], by = 5000)
bin_borders_tib = tibble(start_window = bin_borders) %>%
mutate(interval_number = row_number())
#count the number of integrations per cell line, population and bin
N_ints_pop_tib = tib_filtered_P1_strict %>%
filter(cell_line == "23_34A" & hopped) %>%
group_by(experiment, population) %>%
summarize(N_ints_pop = n())
N_ints_bins_tmp = tib_filtered_P1_strict %>%
filter(chr == "chr3" & cell_line == "23_34A" & hopped) %>%
mutate(interval_number = findInterval(start, bin_borders)) %>%
group_by(experiment, population, interval_number) %>%
summarize(N_ints_bin = n()) %>%
left_join(N_ints_pop_tib) %>%
mutate(fract_ints_bin = N_ints_bin/N_ints_pop)
frac_ints_bins_wide = N_ints_bins_tmp %>%
pivot_wider(id_cols = c(population, interval_number),
names_from = experiment,
values_from = fract_ints_bin,
values_fill = 0)
frac_ints_bins_wide_full =
#make all combinations of interval_number and population
expand_grid(interval_number = c(0, bin_borders_tib$interval_number),
population = unique(N_ints_bins_tmp$population)) %>%
#add the fractions and replace all NAs with zeros
left_join(frac_ints_bins_wide) %>%
mutate(across(everything(), ~replace_na(.x, 0)))
#plot pairs
filter(frac_ints_bins_wide_full,
population != 'ctrl' &
interval_number != c(0, max(interval_number))) %>% #remove the outer bins, which are bigger than 5kb
dplyr::rename(replicate_1 = E2138,
replicate_2 = E2282,
replicate_3 = E2285) %>%
ggpairs(aes(col = population),
columns = paste0('replicate_',1:3),
lower = list(continuous = "points", stroke = NA), #, mapping=aes(color=population
upper = list(continuous = wrap("cor", method = "spearman"))) +
theme_bw(base_size = 12)
I think it is best to use these arbitrary colors from R, because the
blue scale from the beeswarms has too similar colors for this type of
plot.
C9 is split into ‘replicates’, not experiments, otherwise same code as for 23_34A
#count the number of integrations per cell line, population and bin
N_ints_pop_tib = tib_filtered_P1_strict %>%
filter(cell_line == "C9_mCh_34" & hopped) %>%
group_by(replicate, population) %>%
summarize(N_ints_pop = n())
N_ints_bins_tmp = tib_filtered_P1_strict %>%
filter(chr == "chr3" & cell_line == "C9_mCh_34" & hopped) %>%
mutate(interval_number = findInterval(start, bin_borders)) %>%
group_by(replicate, population, interval_number) %>%
summarize(N_ints_bin = n()) %>%
left_join(N_ints_pop_tib) %>%
mutate(fract_ints_bin = N_ints_bin/N_ints_pop)
frac_ints_bins_wide = N_ints_bins_tmp %>%
pivot_wider(id_cols = c(population, interval_number),
names_from = replicate,
values_from = fract_ints_bin,
values_fill = 0)
frac_ints_bins_wide_full =
#make all combinations of interval_number and population
expand_grid(interval_number = c(0, bin_borders_tib$interval_number),
population = unique(N_ints_bins_tmp$population)) %>%
#add the fractions and replace all NAs with zeros
left_join(frac_ints_bins_wide) %>%
mutate(across(everything(), ~replace_na(.x, 0)))
#plot pairs
filter(frac_ints_bins_wide_full,
population != 'ctrl' &
interval_number != c(0, max(interval_number))) %>% #remove the outer bins, which are bigger than 5kb
dplyr::rename(replicate_1 = rep1,
replicate_2 = rep2) %>%
ggpairs(aes(col = population),
columns = paste0('replicate_',1:2),
lower = list(continuous = "points", stroke = NA), #, mapping=aes(color=population
upper = list(continuous = wrap("cor", method = "spearman"))) +
# coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.25)) +
theme_bw(base_size = 12) Calculate (bootstrapped) expression score 10kb
# set.seed(20240705)
# expr_score_tib_str = create_bootstrap_tibble(FUN = function(TIB){
# expr_score_function_stranded(TIB, binsize = 1000,
# N_bins_window = 10,
# calculation_window = prange_zoom,
# CL = "23_34A", EXP = all_exp)},
# TIB = tib_filtered_mTurq_vals,
# N_resamples = N_bootstrap)
# saveRDS(expr_score_tib_str, "/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_23_34A_stranded_10kb.rds")
expr_score_tib_str = readRDS("/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_23_34A_stranded_10kb.rds")
expr_score_sum_tib_str = bootstrap_summary_fun_str(expr_score_tib_str,
score_column = "expr_score_window",
conf_int = 0.95)expr_plot_prange_zoom_str = ggplot(expr_score_sum_tib_str, aes(x = mid_interval, col = strand, fill = strand, linetype = strand)) +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_23), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')
) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower, ymax = expr_score_window_upper), alpha = 0.2, linewidth = NA) +
# geom_line( aes(y = expr_score_window_mean), col = 'orange') +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median)) +
#axes
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=prange_zoom_paper, expand=c(0,0)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=6),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'bottom') +
colScale2_blue +
fillScale2_blue
expr_plot_prange_zoom_strCalculate (bootstrapped) expression score
# set.seed(20240705)
# expr_score_tib_str_C9 = create_bootstrap_tibble(FUN = function(TIB){
# expr_score_function_stranded(TIB, binsize = 500,
# N_bins_window = 10,
# calculation_window = prange_zoom,
# CL = "C9_mCh_34", EXP = all_exp)},
# TIB = tib_filtered_mTurq_vals,
# N_resamples = N_bootstrap)
# saveRDS(expr_score_tib_str_C9, "/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_stranded_C9_mCh_34.rds")
expr_score_tib_str_C9 = readRDS("/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240705_expression_score_stranded_C9_mCh_34.rds")
expr_score_sum_tib_str_C9 = bootstrap_summary_fun_str(expr_score_tib_str_C9,
score_column = "expr_score_window",
conf_int = 0.95)expr_plot_prange_zoom_str_C9 = ggplot(expr_score_sum_tib_str_C9, aes(x = mid_interval, col = strand, fill = strand, linetype = strand)) +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_C9), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')
) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower, ymax = expr_score_window_upper), alpha = 0.2, linewidth = NA) +
# geom_line( aes(y = expr_score_window_mean), col = 'orange') +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median)) +
#axes
scale_x_continuous(breaks = scales::pretty_breaks(n = 6), labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits=prange_zoom_paper, expand=c(0,0)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=6),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'bottom') +
colScale2_blue +
fillScale2_blue
expr_plot_prange_zoom_str_C9Load E2555, for C9_mCh_34 gates and comparison of untreated C9_mCh_34, 23_34A and F121/9, WT. ## load FACSDiva file E2555
library(openCyto)
library(CytoML)
library(flowWorkspace)
diva_ws <- open_diva_xml(diva_xml_file_E2555_sorting)
gs <- diva_to_gatingset(diva_ws,
name = 1, #the group to import
path = path_FACS_sorting_E2555,
#swapped columns are a known diva 'bug', for some files you need to 'unswap' them, default is T (as necessary for this file)
# swap_cols = F,
worksheet = "global",
verbose = F,
execute = T)
pdata_tib = pData(gs) %>%
as_tibble() %>%
mutate(name_short = str_remove(str_remove(name, "ME2024050._E2534_rep._"), "_([0-9]){3}.fcs"),
replicate = str_extract(name_short, "rep.$"),
cell_line = case_when(grepl("WT", name_short) ~ "WT",
grepl("F121_9", name_short) ~ "F121_9",
grepl("23_34A", name_short) ~ "23_34A",
.default = str_extract(name_short, "C9_.._mCh.")),
treatment = case_when(grepl("PB", name_short) ~ "PB",
grepl("SB", name_short) ~ "SB",
.default = "untreated"),
recording = case_when(grepl("_SORT", name_short) ~ "sorting",
.default = "pre-recording"),
sample_type = case_when(cell_line %in% c("WT", "F121_9", "23_34A") | treatment == "untreated" ~ "control",
.default = "sample")
) %>%
mutate(construct = case_when(cell_line %in% c("WT", "F121_9") ~ "none",
cell_line == c("23_34A") ~ "34",
.default = str_extract(cell_line, "([0-9]){2}")),
mCh_status = case_when(cell_line == "WT" ~ "mCh-",
cell_line %in% c("23_34A", "F121_9") ~ "mCh+",
.default = str_extract(cell_line, "mCh.")))
pdata_df = as.data.frame(pdata_tib)
rownames(pdata_df) = pdata_df$name
pData(gs) = pdata_dfThe gating set, ggcyto and cowplot don’t work together nicely. Fortifying to a flowSet doesn’t help, because then I can’t inverse transform the axis. So instead I just extract all the data as a tibble and plot those.
gs_oi_ctrls = subset(gs, sample_type == "control" & cell_line %in% c("WT", "F121_9", "23_34A", "C9_34_mCh+")) #combine pre-recording & sorting to have as much data as possible
colscale_34 = scale_colour_manual(values = c(`34` = '#0077BB', none = 'grey60'), aesthetics = c("fill", "colour"))
tib_expr_val_ls = lapply(1:4, function(cl){
data_pop = gh_pop_get_data(gs_oi_ctrls[[cl]], "P4", inverse.transform = T)
as_tibble(flowCore::exprs(data_pop))
})
names(tib_expr_val_ls) = c(pData(gs_oi_ctrls)$name)
tib_expr_val = bind_rows(tib_expr_val_ls, .id = "name")
tib_expr_val = left_join(tib_expr_val, pData(gs_oi_ctrls))
untr_GFP = ggplot(tib_expr_val, aes(x = `BL[B] 530/30-A`, fill = construct)) +
geom_density_ridges(aes(y = factor(cell_line,
levels = c("WT", "F121_9", "23_34A", "C9_34_mCh+"))),
linewidth = 0.5,
scale = 1.5,
alpha = 1,
quantile_lines = T, quantiles = 2) +
theme_bw() +
ggcyto::scale_x_flowjo_biexp(widthBasis = -100, breaks = c(-10^3, 0, 10^3, 10^4, 10^5),
pos = 4.42) +
theme(legend.position = 'none') +
labs(y = "cell line", x = "Sox2::eGFP")+
colscale_34
untr_mCh = ggplot(tib_expr_val, aes(x = `YG[D] 610/20-A`, fill = construct)) +
geom_density_ridges(aes(y = factor(cell_line,
levels = c("WT", "F121_9", "23_34A", "C9_34_mCh+"))),
linewidth = 0.5,
scale = 1.5,
alpha = 1,
quantile_lines = T, quantiles = 2) +
theme_bw() +
ggcyto::scale_x_flowjo_biexp(widthBasis = -100, breaks = c(-10^3, 0, 10^3, 10^4, 10^5),
pos = 4.42) +
theme(legend.position = 'none') +
labs(y = "cell line", x = "Sox2::mCherry")+
colscale_34
untr_mT = ggplot(tib_expr_val, aes(x = `V[F] 450/50-A`, fill = construct)) +
geom_density_ridges(aes(y = factor(cell_line,
levels = c("WT", "F121_9", "23_34A", "C9_34_mCh+"))),
linewidth = 0.5,
scale = 1.5,
alpha = 1,
quantile_lines = T, quantiles = 2) +
theme_bw() +
ggcyto::scale_x_flowjo_biexp(widthBasis = -100, breaks = c(-10^3, 0, 10^3, 10^4, 10^5),
pos = 4.42) +
theme(legend.position = 'none') +
labs(y = "cell line", x = "reporter (mTurq)")+
colscale_34
cowplot::plot_grid(plotlist = list(untr_GFP, untr_mCh, untr_mT),
align = 'h', nrow = 1, axis = 'tb')gs_oi_mChpos = subset(gs, sample_type == "sample" & mCh_status == "mCh+" & cell_line == "C9_34_mCh+" &
(recording == "sorting" | treatment == "PB")) #plot the recording only, combining 2 data sets doesn't give the right percentages on the gates
ggcyto::ggcyto(gs_oi_mChpos, aes(x = "GFP", y = "mTurqoise2"), subset = "mCh_pos") +
#facet by cell line / treatment
facet_grid(cell_line ~ treatment) +
#plot cells
geom_hex(bins = 128, aes( fill = after_stat(ncount))) +
scale_fill_distiller(palette = 'Spectral') + #manually adding a fill scale also forces the scale to be linear again.
#plot_gates
ggcyto::geom_gate(paste0("P", 1:6, "_mCh+"), col = 'black') +
ggcyto::geom_stats(adjust = c(-0.3, 0.5), digits = 2, size = 3) +
#layout
theme_bw(base_size = 14) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position = "none",
aspect.ratio = 1) +
ggtitle(element_blank())+
ggcyto::axis_x_inverse_trans() +
ggcyto::axis_y_inverse_trans() +
ggcyto::ggcyto_par_set(limits = list(x = c(1.2, 4), y = c(0, 4.2))) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0))+
ggcyto::labs_cyto(labels = "marker")diva_ws_E2096 <- open_diva_xml(diva_xml_file_E2096_sorting)
gs_E2096 <- diva_to_gatingset(diva_ws_E2096,
name = 1, #the group to import
path = path_FACS_sorting_E2096,
swap_cols = F, #apparently we don't need to swap the FSC/SSC-H/W here (this is a known diva 'bug', if you import the xml into flowjo the columns are swapped but apparently not here)
worksheet = "global",
verbose = F,
execute = T)
pdata_tib_E2096 = pData(gs_E2096) %>%
as_tibble() %>%
mutate(name_short = str_remove(str_remove(name, "Mathias (BvS) 20221117_mTurq_"), "_([0-9]){3}.fcs"),
cell_line = case_when(grepl("neg_ctrl", name_short) ~ "WT",
grepl("F121,2f,9 parental", name_short) ~ "F121_9",
grepl("23_34", name_short) ~ "23_34A",
grepl("23_A1_34", name_short) ~ "A1_34"),
treatment = case_when(grepl("PB", name_short) ~ "PB",
grepl("SB", name_short) ~ "SB",
.default = "untreated"),
recording = case_when(grepl("_sort", name_short) ~ "sorting",
.default = "pre-recording"),
sample_type = case_when(cell_line %in% c("WT", "F121_9") | treatment == "untreated" ~ "control",
.default = "sample")
)
pdata_tib_E2096_df = as.data.frame(pdata_tib_E2096)
rownames(pdata_tib_E2096_df) = pdata_tib_E2096_df$name
pData(gs_E2096) = pdata_tib_E2096_dfgs_oi_E2096 = subset(gs_E2096, sample_type == "sample" & cell_line == "23_34A" &
(recording == "sorting" | treatment == "PB")) #recording only, combining 2 data sets doesn't give the right percentages on the gates
ggcyto::ggcyto(gs_oi_E2096, aes(x = "GFP", y = "mTurqoise2"), subset = "P4") + #P4 is just the single cells, don't confuse with 'gate4'
#facet by cell line / treatment
facet_grid(cell_line ~ treatment) +
#plot cells
geom_hex(bins = 128, aes( fill = after_stat(ncount))) +
scale_fill_distiller(palette = 'Spectral') + #manually adding a fill scale also forces the scale to be linear again.
#plot_gates
ggcyto::geom_gate(paste0("gate", 1:6), col = 'black') +
ggcyto::geom_stats(adjust = c(-0.3, 0.5), digits = 2, size = 3) +
#layout
theme_bw(base_size = 14) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
legend.position = "none",
aspect.ratio = 1) +
ggtitle(element_blank())+
ggcyto::axis_x_inverse_trans() +
ggcyto::axis_y_inverse_trans() +
ggcyto::ggcyto_par_set(limits = list(x = c(1.2, 4), y = c(0, 4.2))) +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0))+
ggcyto::labs_cyto(labels = "marker")We only use the insertion clones, so only use those
#load the insertion data & the link with the wells
top_insertion_clones = readRDS(path_insertion_clones_E2221)
link_well_clonename = read_tsv(path_link_well_clonename_E2221)
## Link location - FACS data
link_well_clonename_tib = link_well_clonename %>%
mutate(sample = paste0("clone_", `clone name`,"_34"),
plate_name =split_string(Plate_well_new, "_", 1),
plate_number = case_when(plate_name == "plate1" ~ 1,
plate_name == "plate25" ~ 2,
plate_name == "plate3645" ~ 3)
) %>%
dplyr::rename('well_ID'= `Well index culture plate`) %>%
rename(clone_name = `clone name`)
linked_clones = left_join(top_insertion_clones, link_well_clonename_tib) Load all FCS files
all_fcs_files = dir(path_fcs_files_E2221, full.names = T)
names(all_fcs_files) = split_string(str_remove(all_fcs_files, '.*/export_'), "_", 1,2)
fcs_files_tib = tibble(filename = all_fcs_files,
Plate_well_new = names(all_fcs_files))
fcs_files_sel = left_join(linked_clones, fcs_files_tib) %>%
filter(!is.na(filename)) %>%#remove the clone we didn't map
mutate(clone_name = paste0("clone_", clone_name))
#make dataframe
fcs_files_sel_df = data.frame(fcs_files_sel)
rownames(fcs_files_sel_df) = fcs_files_sel_df$clone_name
#load data
flowset_plates = read.flowSet(files = fcs_files_sel_df$filename, alter.names = T, truncate_max_range = F)
flowCore::sampleNames(flowset_plates) = fcs_files_sel_df$clone_name
flowCore::pData(flowset_plates) = fcs_files_sel_df
flowset_fluo = flowset_plates[,c('BL.B..530_30.A', 'YG.D..610_20.A', 'V.F..450_50.A')]
colnames(flowset_fluo) = c("GFP", "mCherry", "mTurq")calculate median
#calculate central tendency
median_exprs = t(sapply(1:length(flowset_fluo), function(i){
apply(exprs(flowset_fluo[[i]]),2,median)
}))
rownames(median_exprs) <- sampleNames(flowset_fluo)
colnames(median_exprs) <- paste0("median_", colnames(exprs(flowset_fluo[[1]])))
#pull out unhopped clones
unhopped_clones = fcs_files_sel %>% filter(start == 34643961) %>% pull(clone_name)
median_exprs[unhopped_clones,]## median_GFP median_mCherry median_mTurq
## clone_2-85 6487.84 13071.81 784.8
## clone_5-67 5602.24 11791.68 637.6
We don’t have a WT control for this clonal data. The unhopped reporters from the clonal data (measured on 20230228) are most similar to the measurements from E2250 (date 20230418), so we will use the WT control from that date: ME20230418_E2250_pools_WT
flowset_WT_ctrl_E2250 = flowCore::read.flowSet(files = path_WT_ctrl_E2250, alter.names = T, truncate_max_range = F)[,c('BL.B..530_30.A', 'YG.D..610_20.A', 'V.F..450_50.A', 'FSC.A', 'SSC.A')]
colnames(flowset_WT_ctrl_E2250) = c("GFP", "mCherry", "mTurq",'FSC', 'SSC')
autofluo_est = tibble(experiment = "E2250",
date = "20230418",
GFP_WT = median(exprs(flowset_WT_ctrl_E2250[[1]])[,'GFP']),
mCherry_WT = median(exprs(flowset_WT_ctrl_E2250[[1]])[,'mCherry']),
mTurq_WT = median(exprs(flowset_WT_ctrl_E2250[[1]])[,'mTurq']))median_exprs_tib = as_tibble(median_exprs) %>%
mutate(clone_name = rownames(median_exprs))
fcs_tib_clones = fcs_files_sel %>%
left_join(median_exprs_tib) %>%
mutate(hopped = start != 34643961) %>%
mutate(median_mTurq_cor = median_mTurq - autofluo_est$mTurq_WT,
median_mCherry_cor = median_mCherry - autofluo_est$mCherry_WT,
median_GFP_cor = median_GFP - autofluo_est$GFP_WT) %>%
mutate(mTurq_norm = median_mTurq_cor / median_GFP_cor,
mCherry_norm = median_mCherry_cor / median_GFP_cor)
#normalize to 23_34A (to compare between measurement days)
standard_tib = fcs_tib_clones %>%
filter(! hopped) %>%
group_by(start) %>%
summarize(
#GFP normalized
mTurq_norm_standard = mean(mTurq_norm),
mCherry_norm_standard = mean(mCherry_norm),
#not GFP normalized
GFP_standard = mean(median_GFP_cor),
mCherry_standard = mean(median_mCherry_cor))
expression_tib_cor = fcs_tib_clones %>%
mutate(
#GFP normalized
rel_norm_mTurq = mTurq_norm / standard_tib$mTurq_norm_standard,
rel_norm_mCherry = mCherry_norm / standard_tib$mCherry_norm_standard,
#not GFP normalized
rel_GFP_cor = median_GFP_cor / standard_tib$GFP_standard,
rel_mCherry_cor = median_mCherry_cor / standard_tib$mCherry_standard
)expression_tib_cor = expression_tib_cor %>%
#group the regions
mutate(region = case_when(start <= 34.66E6 & start >= 34.64E6 ~ "around Sox2",
start <= 34.9E6 & start >= prange_SCR[1] ~ "around SCR")) %>%
mutate(region = factor(region, levels = c("around Sox2", "around SCR")))Compute the geom_smooth from the mean per location (so the smooth matches the points actually in the plot). That is computed here:
ins_for_plot = filter(expression_tib_cor, start >= prange_zoom_paper[1] & start <= prange_zoom_paper[2] & chr == "chr3" & observed_clonetype_mapping == "insertion")
ins_for_plot_mean = ins_for_plot %>%
group_by(chr, start, strand, region) %>%
summarize(rel_norm_mTurq = mean(rel_norm_mTurq),
rel_norm_mCherry = mean(rel_norm_mCherry))
clones_norm_expr_facet_cor2 = ggplot(ins_for_plot_mean,
aes(x = start, y = rel_norm_mTurq)) +
facet_grid(~ region, space = "free_x", scales = "free_x") +
# # annotate enhancer
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown) +
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown) +
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf, alpha = .2, fill = 'red') +
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
#annotate landing pad
geom_vline(xintercept = start(landingPad_23), col = 'darkgrey') +
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted') +
#layout
labs(x = 'Genomic coordinate (Mb)',
y = 'relative mTurq') +
theme_classic(base_size = 16) +
facetted_pos_scales(
x = list(
region == "around Sox2" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.64E6, 34.66E6),
breaks = c(34.64E6,34.65E6, 34.66E6)),
region == "around SCR" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.735E6, 34.775E6))
)) +
scale_y_continuous(breaks = scales::pretty_breaks(n=6),
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05))) +
geom_hline(yintercept = 1, linetype = 'dashed')+
#data
geom_point(aes(shape = strand), col = 'black', size = 2) +
# stat_summary(fun = mean, col = 'black', aes(shape = strand)) +
#smooth
geom_smooth(data = filter(ins_for_plot_mean, region == "around SCR"), aes(x = start, y = rel_norm_mTurq), col = 'grey30')Annotate clones plot with expression score (smaller bin size) Plot for 23_34a
set.seed(20231012)
#calculate expression scores with a smaller bin size (sb)
# expr_score_tib_sb = create_bootstrap_tibble(FUN = function(TIB){expr_score_function(TIB,
# binsize = 500,
# N_bins_window = 10,
# calculation_window = prange_zoom,
# CL = "23_34A", EXP = all_exp)},
# TIB = tib_filtered_mTurq_vals,
# N_resamples = N_bootstrap)
# saveRDS(expr_score_tib_sb, "/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240408_expression_score_SCR.rds")
expr_score_tib_sb_23_34A = readRDS("/DATA/projects/Sox2/Figure_reporter_only_hopping/rds_files/CM20240408_expression_score_SCR.rds")
expr_score_sum_tib_sb_23_34A = bootstrap_summary_fun(expr_score_tib_sb_23_34A,
score_column = "expr_score_window",
conf_int = 0.95)
#plot regions we have clones for (in facets)
expr_score_sum_tib_sb_23_34A_facet = expr_score_sum_tib_sb_23_34A %>%
mutate(region = case_when(mid_interval <= 34.66E6 & mid_interval >= 34.64E6 ~ "around Sox2",
mid_interval <= 34.9E6 & mid_interval >= prange_SCR[1] ~ "around SCR")) %>%
mutate(region = factor(region, levels = c("around Sox2", "around SCR"))) %>%
filter(!is.na(region))
expr_score_region_clones_23_34A = ggplot(expr_score_sum_tib_sb_23_34A_facet, aes(x = mid_interval)) +
facet_grid(. ~ region, space = "free_x", scales = "free_x") +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_23), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower, ymax = expr_score_window_upper), alpha = 0.2) +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median)) +
#axes
facetted_pos_scales(
x = list(
region == "around Sox2" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.64E6, 34.66E6),
breaks = c(34.64E6,34.65E6, 34.66E6)),
region == "around SCR" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
# limits = c(start(SCR_50kb), end(SCR_50kb)))
limits = c(34.735E6, 34.775E6))
)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=4),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'bottom') Plot for C9 (no need to calculate new expression score, is already on 5kb window (500bp bins))
#plot regions we have clones for (in facets)
expr_score_sum_tib_sb_C9_facet = expr_score_sum_tib_C9 %>%
mutate(region = case_when(mid_interval <= 34.66E6 & mid_interval >= 34.64E6 ~ "around Sox2",
mid_interval <= 34.9E6 & mid_interval >= prange_SCR[1] ~ "around SCR")) %>%
mutate(region = factor(region, levels = c("around Sox2", "around SCR"))) %>%
filter(!is.na(region))
expr_score_region_clones_C9 = ggplot(expr_score_sum_tib_sb_C9_facet, aes(x = mid_interval)) +
facet_grid(. ~ region, space = "free_x", scales = "free_x") +
# annotate enhancer
list(
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown),
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown),
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf,alpha = .2, fill = 'red'),
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red'),
geom_vline(xintercept = start(landingPad_C9), col = 'darkgrey'),
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted')) +
## plot the confidence interval based on the resampled data
geom_ribbon(aes(ymin = expr_score_window_lower, ymax = expr_score_window_upper), alpha = 0.2) +
## plot the median based on the resampled data
geom_line(aes(y = expr_score_window_median)) +
#axes
facetted_pos_scales(
x = list(
region == "around Sox2" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.64E6, 34.66E6),
breaks = c(34.64E6,34.65E6, 34.66E6)),
region == "around SCR" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
# limits = c(start(SCR_50kb), end(SCR_50kb)))
limits = c(34.735E6, 34.775E6))
)) +
scale_y_continuous(expand=expansion(mult = c(0, 0.05)),
breaks = scales::pretty_breaks(n=4),
limits = c(0, NA)) +
labs(x='Genomic coordinate (Mb)',y='expression score') +
theme_classic(base_size = 16) +
theme(legend.position = 'bottom') expr_score_region_clones_23_34A_noX = expr_score_region_clones_23_34A +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank()
)
expr_score_region_clones_C9_noX = expr_score_region_clones_C9 +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank()
)
clones_expr_score_comb = cowplot::plot_grid(expr_score_region_clones_23_34A_noX,
expr_score_region_clones_C9_noX,
clones_norm_expr_facet_cor2,
ncol = 1, align="v", axis="lr", rel_heights = c(0.9, 0.9, 1.1)) #, rel_heights = c(90, 110))
clones_expr_score_combIn figure 5 we need 1 panel based on the clones, I generate it here so we don’t have to copy so much code.
clones_norm_expr_mCh_facet_cor = ggplot(ins_for_plot_mean,
aes(x = start, y = rel_norm_mCherry)) +
facet_grid(~ region, space = "free_x", scales = "free_x") +
# # annotate enhancer
annotate("rect", xmin = start(enhancer), xmax = end(enhancer), ymin = -Inf, ymax = Inf, alpha = .2, fill = brown) +
geom_vline(xintercept = c(start(enhancer), end(enhancer)), col = brown) +
# annotate gene
annotate("rect", xmin = start(Sox2_gene), xmax = end(Sox2_gene), ymin = -Inf, ymax = Inf, alpha = .2, fill = 'red') +
geom_vline(xintercept = c(start(Sox2_gene), end(Sox2_gene)), col = 'red') +
#annotate landing pad
geom_vline(xintercept = start(landingPad_23), col = 'darkgrey') +
geom_vline(xintercept = start(CTCF_mm10.chr3_extra), col = 'darkgrey',linetype = 'dotted') +
#layout
labs(x = 'Genomic coordinate (Mb)',
y = 'relative mCherry') +
theme_classic(base_size = 16) +
facetted_pos_scales(
x = list(
region == "around Sox2" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.64E6, 34.66E6),
breaks = c(34.64E6,34.65E6, 34.66E6)),
region == "around SCR" ~ scale_x_continuous(labels = scales::unit_format(scale = 1E-6, accuracy = 0.01, unit=NULL),
limits = c(34.735E6, 34.775E6))
)) +
scale_y_continuous(breaks = scales::pretty_breaks(n=6),
limits = c(0, NA),
expand = expansion(mult = c(0, 0.05))) +
geom_hline(yintercept = 1, linetype = 'dashed')+
#data
# geom_point(aes(shape = strand, text = sample, text2 = Plate_well_new), col = 'darkgrey', stroke = NA) +
stat_summary(fun = mean, col = 'black', aes(shape = strand)) +
#smooth
geom_smooth(data = filter(ins_for_plot, region == "around SCR"), aes(x = start, y = rel_norm_mCherry), col = 'grey30')clones_norm_expr_facet_cor_noX = clones_norm_expr_facet_cor2 +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank()
)
clones_mT_mCh_comb= cowplot::plot_grid(clones_norm_expr_facet_cor_noX, clones_norm_expr_mCh_facet_cor,
ncol = 1, align="v", axis="lr", rel_heights = c(90, 110))
clones_mT_mCh_combMinimum level of mCherry (according to the smoothened data) is 0.75: